Set up

setwd("/home/Data/Genotype/SzCausal_202211/Summary202301/")
source("../helper.R") #helper functions and library load
source("../notearsMClosses.R")
source("../utils.R")
library(readxl)
library(dplyr)
library(ggpubr)
library(reshape2)
library(ggplot2)
set.seed(101)

Dara pre-processing

1. Read new data

Read protein data

# proteomic data
synaptosome = read_xlsx("../oneDriveData/LMER-REGRESSION/RESIDUALS+DX_SYNAPTOSOME-PROTEIN-no-pH.xlsx")
phospho = read_xlsx("../oneDriveData/LMER-REGRESSION/RESIDUALS+DX_PHOSPHO-PEPTIDE-2022-11-08.xlsx")
homogenate = read_xlsx("../oneDriveData/LMER-REGRESSION/RESIDUALS+DX_HOMOGENATE-PROTEIN.xlsx")
colnames(synaptosome)[1]="X"
colnames(phospho)[1]="X"
colnames(homogenate)[1]="X"
synaptosome=as.data.frame(synaptosome)
phospho=as.data.frame(phospho)
homogenate=as.data.frame(homogenate)

Read spine density data

# spine density
spine.densL3 = read_xlsx("../oneDriveData/spine_density/spine_L3_SML_mean_medianNorm_forVas.xlsx")
spine.densL5 = read_xlsx("../oneDriveData/spine_density/spine_L5_SML_mean_medianNorm_20220604.xlsx")

Read phenotype data

# pheno
pheno = read_xlsx("../oneDriveData/metadata/All_Subject_Demographics.xlsx")
pheno = as.data.frame(pheno)
rownames(pheno) = paste0(c("S"), pheno$HU)

Read phenotype data and ID map

# mapping IDs
# phosphopeptide_to_phosphosite
phosphoMap_LMER = read.csv("../oneDriveData/LMER-REGRESSION/VAS_LMER-REGRESSION-PHOSPHO-SITE_2022-11-17.csv")
imputed_phos_mapped = readRDS("../savedRDS/imputed_phos_mapped_Vas.RDS")
# uniprot map
protMap_phos = read_xlsx("../oneDriveData/metadata/20220509_uniprot_map_legacy_phos.xlsx")
protMap_hom = read_xlsx("../oneDriveData/metadata/20220920_uniprot_map_legacy_hom.xlsx")
protMap_syn = read_xlsx("../oneDriveData/metadata/20221003_uniprot_map_legacy_syn.xlsx")

2. Process spine density data

# combine spine density data
colnames(spine.densL3) = c("patient", "L3.spineS", "L3.spineM", "L3.spineL")
colnames(spine.densL5) = c("patient", "L5.spineS", "L5.spineM", "L5.spineL")
spine.dens = merge(spine.densL3, spine.densL5, by="patient")
spine.dens$patient = paste0(c("S"), spine.dens$patient)
rownames(spine.dens) = spine.dens$patient

Do average after t scale

# t-scaled spine.dens
spine.dens = tscale(t(spine.dens[,-1]))
# get averaged spine density
spine.dens = rbind(spine.dens,
                 (spine.dens[1,]+spine.dens[4,])/2,
                 (spine.dens[2,]+spine.dens[5,])/2,
                 (spine.dens[3,]+spine.dens[6,])/2)
rownames(spine.dens)[7:9] = c("Avg.spineS", "Avg.spineM", "Avg.spineL")
spine.dens[,1:5]
##                   S537        S567       S621        S659        S665
## L3.spineS   0.79071598  1.70373906  1.0379846  0.40711751 -0.35239506
## L3.spineM   0.74341847  1.21889312 -0.2756114 -0.05084451 -0.36374846
## L3.spineL   0.16157153 -0.03360054 -1.7370161 -0.39026201 -0.36374524
## L5.spineS   0.54341023  0.92139354  1.1607254 -0.18480922  0.26402584
## L5.spineM   0.12056845  0.18182508 -0.4756850  0.40036962 -0.01857248
## L5.spineL  -0.35458365 -0.24950081 -0.3205690  1.93167993  0.68511677
## Avg.spineS  0.66706311  1.31256630  1.0993550  0.11115414 -0.04418461
## Avg.spineM  0.43199346  0.70035910 -0.3756482  0.17476255 -0.19116047
## Avg.spineL -0.09650606 -0.14155068 -1.0287926  0.77070896  0.16068576

3. Process protein data

Look at protein data

dim(phospho)
## [1] 8604   85
phospho[1:5,1:5]
##                                      X       S1543       S1247      S1240
## 1 [K].RLsLESEGAGEGAAASPELSALEEAFRR.[F]  0.08914847  0.06069769 -0.3763341
## 2        [R].QQFTVSSGEsPPLSAGNIYQk.[R]  0.08978403 -0.18648350  0.1182183
## 3              [R].GLYDGPVcEVSVtPk.[T] -0.25367412 -0.44800185  0.1993910
## 4         [R].TGVLLPSSPEAEVPGkLsPk.[Q] -0.18580943  0.25834698  0.4435583
## 5     [R].DVmSDETNNEEtESPSQEFVNITk.[Y]  0.06637524 -0.11895118 -0.3151305
##         S659
## 1  0.1991965
## 2  0.5391897
## 3 -0.3116207
## 4 -0.1365558
## 5  0.2138412
dim(homogenate)
## [1] 6154   85
homogenate[1:5,1:5]
##            X       S1543       S1247      S1240        S659
## 1 A0A075B6I9  1.20958964 -0.20179116  0.2348509 -1.04283998
## 2 A0A075B767  0.32676616  0.40828776  0.3972039 -0.63299760
## 3 A0A087WSZ9 -0.02240467  0.09451659  0.1754651 -0.02535614
## 4 A0A087WW87  1.24859795 -0.08112229 -0.8601148 -0.66167317
## 5 A0A087X1C5  0.01549401 -0.04796556 -0.0296112  0.21076040
dim(synaptosome)
## [1] 4273   71
synaptosome[1:5,1:5]
##            X       S1524     S1542        S659       S1256
## 1 A0A087WW87 -0.07364627 0.2011440 -0.01602870  0.05803797
## 2 A0A0J9YX94 -0.07654129 0.1100552  0.01038487  0.36509269
## 3     A0AVF1 -0.01842234 0.0125747  0.17291865 -0.19032530
## 4     A0AVT1  0.28611163 0.4236436 -0.26265143 -0.36867685
## 5     A0FGR8 -0.32771033 0.3617452  0.22083443 -0.01426039

Map protein to Entry name

# map protein to gene
protMap_phos$`Entry name` = gsub("_.*", "", protMap_phos$`Entry name`)
protMap_hom$`Entry Name` = gsub("_.*", "", protMap_hom$`Entry Name`)
protMap_syn$`Entry Name` = gsub("_.*", "", protMap_syn$`Entry Name`)

Create Uniprot to Entry name map

hom = data.frame("EntryName" = protMap_hom[protMap_hom$From%in%homogenate$X,]$`Entry Name`,
                 "X" = protMap_hom[protMap_hom$From%in%homogenate$X,]$From)
syn = data.frame("EntryName" = protMap_syn[protMap_syn$From%in%synaptosome$X,]$`Entry Name`,
                 "X" = protMap_syn[protMap_syn$From%in%synaptosome$X,]$From)
Uniprot_Entry_conv = rbind(hom, syn)
Uniprot_Entry_conv = Uniprot_Entry_conv[!duplicated(Uniprot_Entry_conv), ]
head(Uniprot_Entry_conv)
##   EntryName      X
## 1      PLEC Q15149
## 2     DYHC1 Q14204
## 3     SPTN1 Q13813
## 4     SPTB2 Q01082
## 5      ANK2 Q01484
## 6     MACF1 Q9UPN3

Map homogenate and synaptosome IDs

iim = match(homogenate$X, Uniprot_Entry_conv$X)
sum(is.na(iim))
## [1] 1
homogenate[is.na(iim),]
##           X     S1543      S1247      S1240       S659      S930      S857
## 1101 P0DN79 0.1213453 -0.1068871 0.01900842 -0.2782531 -0.183471 0.2590462
##          S1455       S988       S621      S1686      S1307      S10024
## 1101 0.2738417 0.09690885 -0.2400725 0.04792216 0.07524866 -0.05255184
##          S1047      S933        S700      S625      S1480     S1263       S567
## 1101 0.1375628 0.2031501 -0.01041883 0.1243739 0.05355879 0.2949723 -0.1522442
##            S537      S1374      S904        S686       S727      S829
## 1101 -0.1018258 -0.2401632 0.2778762 0.005830551 -0.1000433 0.2265546
##            S1488      S1222       S1579      S1386      S1420      S1454
## 1101 -0.03472352 0.06512113 -0.03607005 0.02531974 0.01057712 0.02052511
##            S685       S1105     S852       S722     S1086       S630      S566
## 1101 0.07248064 -0.08739716 0.115066 0.03907439 0.2112698 -0.2046825 0.1642356
##           S1317      S1361      S1196      S1211     S10005      S1256
## 1101 0.02711624 0.04556534 -0.1300543 0.06370251 0.02369779 0.07975556
##            S1314     S1372      S1581      S739     S1088      S587        S806
## 1101 -0.01773702 0.1784291 0.01911988 0.2730042 0.1866938 0.1962267 -0.01975071
##           S665       S681       S640      S1326      S1453       S818      S917
## 1101 0.2309043 -0.1628133 0.04867391 -0.5196283 -0.0905203 -0.4174154 0.4013591
##          S1391      S1506     S1384       S1712   S1099     S10023       S1067
## 1101 0.3448422 -0.0045312 0.1825578 -0.03122271 0.01367 -0.0830969 -0.08956527
##           S1296      S1201       S1189       S1092     S1010      S871
## 1101 0.07860413 -0.1192308 -0.04048022 -0.04209686 0.1794047 0.2049523
##              S878       S1524     S1542     S1555      S1649      S1268
## 1101 -0.009398474 -0.03464149 0.3536867 0.1820754 -0.2746762 -0.1230837
##           S1230     S1284     S1188       S822      S787        S1119
## 1101 -0.1250359 0.1289821 0.2044734 0.06100962 -0.283646 -0.008278493
##           S1173
## 1101 0.09637928
iim = match(synaptosome$X, Uniprot_Entry_conv$X)
sum(is.na(iim))
## [1] 1
synaptosome[is.na(iim),]
##          X       S1524     S1542        S659     S1256    S10005      S722
## 783 P0DN79 -0.05277195 0.2277988 -0.02293654 0.1215893 0.1946774 0.5274595
##           S852       S818      S917      S1240     S1247       S1317  S1361
## 783 -0.1190994 -0.4102457 0.9094391 0.07329765 0.4754033 -0.07325104 0.1841
##         S1372     S1222      S1488       S1455      S625       S700     S1307
## 783 0.2533631 0.7574052 0.05695838 -0.03196725 0.3854604 0.04219118 0.2423072
##         S10024     S1230      S1268     S1026      S1555       S1649      S933
## 783 0.04874504 0.8768703 -0.3786701 0.2509153 -0.1995572 -0.08781356 0.5643208
##         S1047     S1119      S1173      S546      S587     S1314      S1270
## 783 0.1431283 0.1201258 -0.2608134 0.4885759 0.2459213 0.6522106 -0.2168916
##         S1579        S787      S822     S1384     S1712      S621        S988
## 783 0.1690741 -0.06982843 0.1290167 0.1520726 0.4581422 0.2114598 0.005760471
##          S566      S1196     S1211        S685     S1105      S640       S681
## 783 0.9669385 -0.5352001 0.6442835 -0.06483736 0.7956093 0.8212378 -0.3953751
##         S686       S1543    S10026      S537      S567     S1296     S1341
## 783 0.154504 -0.08148204 0.1499792 0.6596398 0.1020894 0.3519617 0.5240332
##          S1466       S1263      S1480      S1391     S1506     S1284       S665
## 783 -0.4561494 -0.04124201 0.03518731 0.07048346 0.7399287 -0.120382 -0.3514289
##          S806       S871         S878     S1189      S1255    S10020       S739
## 783 0.4980392 -0.1028951 8.869842e-05 0.5231688 0.03303334 0.9487006 -0.2180877
##         S1088
## 783 0.6556996

Add protein P0DN79 to convert table

Uniprot_Entry_conv = rbind(Uniprot_Entry_conv, c("CBSL", "P0DN79"))
iim=match(homogenate$X, Uniprot_Entry_conv$X)
rownames(homogenate) = as.character(Uniprot_Entry_conv$EntryName[iim])

iim=match(synaptosome$X, Uniprot_Entry_conv$X)
rownames(synaptosome) = Uniprot_Entry_conv$EntryName[iim]
synaptosome$X = NULL
homogenate$X = NULL

Deal with phospho data

phospho$prot_site = imputed_phos_mapped$prot_site

For identical prot_site go to LMER results and pick one with min(q-val) to keep

phos_LMER = read_xlsx("../oneDriveData/LMER-REGRESSION/LMER-REGRESSION-PHOSPHO-PEPTIDE_2022-11-07.xlsx")
phos_LMER = as.data.frame(phos_LMER)
iim = match(phospho$X, phos_LMER$peptide)
phospho$qval = phos_LMER$q[iim]
tmp = phospho[!is.na(phospho$prot_site),]
tmp = tmp[order(tmp$prot_site, tmp$qval, decreasing = F),]
tmp = tmp[!duplicated(tmp$prot_site),]
phospho = tmp
rownames(phospho) = phospho$prot_site
phospho$X = NULL
phospho$prot_site = NULL
phospho$qval = NULL

Phospho site map

load("phospho_site_map.RData")
rownames(phospho) = phospho_site_abbr$rowname

Naming of phospho peptides: p + protein ID + phosphosite index

head(rownames(phospho))
## [1] "pA0FGR8" "pA0JNW5" "pA1IGU5" "pA1Z1Q3" "pA2A288" "pA2A2V5"

4. tscale on all protein data before combination

Look at data distribution

boxplot(homogenate)

boxplot(synaptosome)

boxplot(phospho)

boxplot(t(homogenate[1:50,]))

boxplot(t(synaptosome[1:50,]))

boxplot(t(phospho[1:50,]))

T-scale

# tscale
homogenate = tscale(homogenate)
phospho = tscale(phospho)
synaptosome = tscale(synaptosome)

Look at data distribution after t-scale

boxplot(t(homogenate[1:50,]))

boxplot(t(synaptosome[1:50,]))

boxplot(t(phospho[1:50,]))

5. Combine all proteins

rownames(synaptosome) = paste0(rownames(synaptosome), c('.syn'))

comCol = commonColumns(phospho, synaptosome)
intersectProt = rbind(phospho[,comCol], homogenate[,comCol], synaptosome[,comCol])

comCol = commonColumns(intersectProt, spine.dens)
intersectProt = rbind(phospho[,comCol], homogenate[,comCol], synaptosome[,comCol])
boxplot(intersectProt)

boxplot(t(intersectProt[c(1:20, 8000:8020, 17000:17020),]))

6. Save the protein results

saveRDS(homogenate, "savedRDS/homogenate_LMERtscaled.RDS")
saveRDS(synaptosome, "savedRDS/synaptosome_LMERtscaled.RDS")
saveRDS(phospho, "savedRDS/phospho_LMERtscaled.RDS")
saveRDS(intersectProt, "savedRDS/intersectProt_LMERtscaled.RDS")

Look at spine density and disease prediction

1. Surrogate spine density prediction from limma regression

protein = intersectProt
spine.dens = spine.dens[,colnames(intersectProt)]
pheno = pheno[colnames(protein),]
multi_Avg=data.frame("patient"=colnames(spine.dens))
multi_L3=data.frame("patient"=colnames(spine.dens))
multi_L5=data.frame("patient"=colnames(spine.dens))

Run cv.glmnet (with keep=T) 100 times and average all the minimum cm results.

for (i in 1:100) {
    # print(paste0("working on: ", i, "..."))
    gres = cv.glmnet(y=spine.dens[7,], x=t(protein), family="gaussian", type.measure="mse", alpha=0.9, keep = T)
    multi_Avg=cbind(multi_Avg, gres$fit.preval[,gres$cvm == min(gres$cvm)])
    
    gres = cv.glmnet(y=spine.dens[1,], x=t(protein), family="gaussian", type.measure="mse", alpha=0.9, keep = T)
    multi_L3=cbind(multi_L3, c(gres$fit.preval[,gres$cvm == min(gres$cvm)]))
    
    gres = cv.glmnet(y=spine.dens[4,], x=t(protein), family="gaussian", type.measure="mse", alpha=0.9, keep = T)
    multi_L5=cbind(multi_L5, c(gres$fit.preval[,gres$cvm == min(gres$cvm)]))
}

Use mean from 100 cv.glmnet results with min(cm) to get surrogate prediction

surrogateAvg = apply(multi_Avg[,-1], 1, mean)
surrogateL3 = apply(multi_L3[,-1], 1, mean)
surrogateL5 = apply(multi_L5[,-1], 1, mean)

spinePredict = rbind("surrogateAvg" = surrogateAvg, "surrogateL3" = surrogateL3, "surrogateL5" = surrogateL5)

2. look at prediction results

PredictScatter <- function(data, ylim = c(-0.8,0.8), xlim = c(-2.3, 2), title=NULL, xlab=NULL, ylab=NULL,...) {
    p = ggscatter(data, "real", "surrogate_prediction",
               ylim=ylim, xlim=xlim,
               add = "reg.line", conf.int = TRUE, # Add confidence interval
               cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
               cor.coeff.args = list(method = "pearson", label.x = -2.3, label.y = 0.6, label.sep = "\n", size=3),
               title = title,
               xlab = xlab,
               ylab = ylab)
    p
}

L3 vs L5 correlation shows the best theoretical possible prediction

l3_l5 = data.frame("surrogate_prediction" = spine.dens[1,], "real" = spine.dens[4,])
p = PredictScatter(l3_l5, ylim = c(-2,2),
               title = "L3 vs L5",
               xlab = "L5", ylab = "L3")
p

Prediction results

avg = data.frame("surrogate_prediction" = surrogateAvg, "real"=spine.dens[7,])
l3 = data.frame("surrogate_prediction" = surrogateL3, "real"=spine.dens[1,])
l5 = data.frame("surrogate_prediction" = surrogateL5, "real"=spine.dens[4,])
l3p_l5 = data.frame("surrogate_prediction" = surrogateL3, "real"=spine.dens[4,])
l5p_l3 = data.frame("surrogate_prediction" = surrogateL5, "real"=spine.dens[1,])
l3p_l5p = data.frame("surrogate_prediction" = surrogateL3, "real"=surrogateL5)
p1 = PredictScatter(avg, title = "Avg.predict vs Avg.real",
                    xlab = "Avg.real", ylab = "Avg.surrogate_prediction")

p2 = PredictScatter(l3, title = "L3.predict vs L3.real",
                    xlab = "L3.real", ylab = "L3.surrogate_prediction")

p3 = PredictScatter(l5, title = "L5.predict vs L5.real",
                    xlab = "L5.real", ylab = "L5.surrogate_prediction")

p4 = PredictScatter(l3p_l5, title = "L3.predict vs L5.real",
                    xlab = "L5.real", ylab = "L3.surrogate_prediction")

p5 = PredictScatter(l5p_l3, title = "L5.predict vs L3.real",
                    xlab = "L3.real", ylab = "L5.surrogate_prediction")

p6 = PredictScatter(l3p_l5p, title = "L3.predict vs L5.predict",
                    xlab = "L5.surrogate_prediction", ylab = "L3.surrogate_prediction")

Predict vs real spine density

ggarrange(p1, p2, p3, p4, p5, p6, ncol = 3, nrow = 2)

3. Predict vs real spine density in diagnosis prediction

tscale on predicted spine density

spineData = rbind(spine.dens, tscale(spinePredict))
df = melt(as.matrix(spineData))
colnames(df) = c("type", "subj", "value")
df$pheno = pheno[as.character(df$subj),]$Group #since the subject id is actually a number if may get autoconverted
p = ggboxplot(df, x = "pheno", y = "value", facet.by = "type")+
    stat_compare_means(method = "t.test", label.x = 1, label.y = 4) +
    stat_compare_means(aes(label = ..p.signif..), method = "t.test", label.y = 3, label.x = 1.4)
facet(p,facet.by = "type", ncol=3)

correlation between spine.dens and diagnosis

cor.result = data.frame()
for(i in 1:nrow(spineData)) {
    cor=cor.test(as.numeric(spineData[i,]), as.numeric(pheno[colnames(spineData),]$Group=="C"))
    tmp=t(melt(unlist(cor)))
    cor.result=rbind(cor.result, tmp)
}
rownames(cor.result)=rownames(spineData)
corR=as.data.frame(cor.result)
corR$statistic.t=as.numeric(corR$statistic.t)
corR$estimate.cor=as.numeric(corR$estimate.cor)
corR$p.value=as.numeric(corR$p.value)
ggscatter(corR, "estimate.cor", "statistic.t", label = rownames(corR), repel = T, title = "Spine density and disease diagnosis")

generate LV from protein data decomposition

1. create LVs

# create LVs using the protein dataset
prot.data=matrix(unlist(protein),ncol = ncol(protein))
rownames(prot.data) = rownames(protein)
colnames(prot.data) = colnames(protein)
res = simpleDecomp(tscale(prot.data), k=20, adaptive.frac = 0.01)
## ****
## Computing SVD
## [1] "L1 is set to 39.1454705695919"
## [1] "L2 is set to 117.436411708776"
## Init
## converged at  iteration 67
dim(res$Z)
## [1] 17143    20

2. look at the distribution of LVs

#we should see some with very low variance (drop out)
boxplot(t(res$B), las=2)

plot(sort(bvar<-apply(res$B,1,var)), log="y")

#1e-4 is a good cutoff
#look at the Z, how does the pattern in Z depend on adaptive.frac?
p = pheatmap(getTopEachColumn(res$Z[, bvar>1e-4],top = 20), main="cutoff bvar>0.01")

feature selections

1. patient and control groups

SzP = rownames(pheno[pheno$Group=='Sz',])
Ctrl = rownames(pheno[pheno$Group=='C',])

2. focal variables: diagnosis (Sz vs Ctrl)

2.1 Feature selection without spine density prediction

limma

limres = fitLimma(protein, as.factor(pheno$Group), "Sz-C")
topTable(limres, n=30)
##               logFC      AveExpr         t      P.Value    adj.P.Val        B
## 1433T.syn  1.610003 -0.024210535  6.421753 1.449222e-10 8.064091e-07 13.56665
## KTN1.syn   1.602663 -0.013868570  6.393741 1.739276e-10 8.064091e-07 13.39442
## FN3K.syn   1.595121 -0.003397722  6.364181 2.106803e-10 8.064091e-07 13.21349
## SNX2.syn   1.590265 -0.042487072  6.343754 2.404065e-10 8.064091e-07 13.08893
## TRIM2.syn  1.585402 -0.020206905  6.322845 2.750686e-10 8.064091e-07 12.96185
## COF1.syn   1.579901 -0.040310663  6.299658 3.192230e-10 8.064091e-07 12.82140
## 1433F.syn  1.569705 -0.008944295  6.261592 4.071452e-10 8.064091e-07 12.59194
## PTPA.syn   1.565659 -0.049253269  6.244392 4.542476e-10 8.064091e-07 12.48871
## XPO7.syn   1.562210 -0.001096893  6.233101 4.880178e-10 8.064091e-07 12.42109
## 1433E.syn  1.558736 -0.046139704  6.217201 5.397609e-10 8.064091e-07 12.32608
## CYRIA.syn  1.556076 -0.025105927  6.208205 5.713676e-10 8.064091e-07 12.27243
## pP61764.3 -1.555988  0.023285102 -6.204376 5.853632e-10 8.064091e-07 12.24962
## GDS1.syn   1.553987 -0.012940026  6.197453 6.115218e-10 8.064091e-07 12.20841
## PCY2.syn   1.547864 -0.044222959  6.173350 7.117982e-10 8.715969e-07 12.06529
## HNRPD.syn  1.544260 -0.010018544  6.158077 7.834622e-10 8.737768e-07 11.97489
## MARE3.syn  1.541914 -0.006965985  6.151682 8.155182e-10 8.737768e-07 11.93710
## PURA2.syn  1.533612 -0.064650863  6.115419 1.022975e-09 9.995281e-07 11.72356
## 1433G.syn  1.531810 -0.030298143  6.110442 1.055193e-09 9.995281e-07 11.69435
## 1433Z.syn  1.530272 -0.037679811  6.102625 1.107801e-09 9.995281e-07 11.64852
## CSN5.syn   1.516221 -0.012447993  6.045193 1.580994e-09 1.355149e-06 11.31356
## FAF1.syn   1.513511 -0.072829802  6.036399 1.669015e-09 1.362472e-06 11.26255
## GSLG1.syn -1.510924  0.026154310 -6.026970 1.768706e-09 1.378224e-06 11.20794
## PPME1.syn  1.505743 -0.045685001  6.005468 2.018229e-09 1.421423e-06 11.08371
## HNRPK.syn  1.504202  0.005252015  6.002862 2.050706e-09 1.421423e-06 11.06868
## pP55327.1 -1.505663  0.099833704 -6.001105 2.072891e-09 1.421423e-06 11.05856
## HGS.syn    1.502475 -0.026823754  5.991686 2.195870e-09 1.447838e-06 11.00431
## GUAA.syn   1.500551 -0.051628875  5.984622 2.292722e-09 1.455709e-06 10.96370
## SNX6.syn   1.498201 -0.033460567  5.974907 2.432777e-09 1.489467e-06 10.90790
## DP13A.syn  1.495068 -0.045357310  5.964633 2.589925e-09 1.531003e-06 10.84900
## DCTN4.syn  1.490574 -0.030042992  5.943971 2.936520e-09 1.649118e-06 10.73083

run multi-elastic nets

resGN = runGnetMulti(t(protein), y=pheno$Group=="Sz", nfolds=3, bootstrap = T, alpha = 0.9)
## family is set to binomial

choose featrues to label

# choose features to label
toLabel = rownames(protein)[which(resGN$features>75 | abs(limres$t)>6)]
length(toLabel) #do not plot more than 30 labels, they will be subsampled
## [1] 33

scatter plot for feature selection

myggscatter(resGN$features, limres$t, line = F, label = rownames(protein),
            xlab="Number of models chosen in 100 elastic net regressions",
            ylab="t-statistic value",
            label.select = toLabel,
            title="Focal variable: diagnosis (Sz-Control)",
            subtitle = "models > 60 or abs(t) > 6")

2.2 feature selection with spine density prediction

Limma has no difference compared with spinePredict excluded

data = rbind(protein, tscale(spinePredict))
limres.predictS = fitLimma(data, as.factor(pheno$Group), "Sz-C")
topTable(limres.predictS, n=30)
##               logFC      AveExpr         t      P.Value    adj.P.Val        B
## 1433T.syn  1.610003 -0.024210535  6.422513 1.446083e-10 8.049500e-07 13.56879
## KTN1.syn   1.602663 -0.013868570  6.394546 1.734950e-10 8.049500e-07 13.39686
## FN3K.syn   1.595121 -0.003397722  6.365003 2.101279e-10 8.049500e-07 13.21605
## SNX2.syn   1.590265 -0.042487072  6.344533 2.398388e-10 8.049500e-07 13.09125
## TRIM2.syn  1.585402 -0.020206905  6.323562 2.745217e-10 8.049500e-07 12.96381
## COF1.syn   1.579901 -0.040310663  6.300325 3.186849e-10 8.049500e-07 12.82308
## 1433F.syn  1.569705 -0.008944295  6.262355 4.061960e-10 8.049500e-07 12.59423
## PTPA.syn   1.565659 -0.049253269  6.245112 4.533061e-10 8.049500e-07 12.49075
## XPO7.syn   1.562210 -0.001096893  6.233915 4.867119e-10 8.049500e-07 12.42371
## 1433E.syn  1.558736 -0.046139704  6.217934 5.385851e-10 8.049500e-07 12.32823
## CYRIA.syn  1.556076 -0.025105927  6.208999 5.698980e-10 8.049500e-07 12.27495
## pP61764.3 -1.555988  0.023285102 -6.205035 5.843531e-10 8.049500e-07 12.25134
## GDS1.syn   1.553987 -0.012940026  6.198152 6.103085e-10 8.049500e-07 12.21037
## PCY2.syn   1.547864 -0.044222959  6.174059 7.103291e-10 8.699501e-07 12.06733
## HNRPD.syn  1.544260 -0.010018544  6.158749 7.820146e-10 8.716890e-07 11.97672
## MARE3.syn  1.541914 -0.006965985  6.152468 8.134272e-10 8.716890e-07 11.93961
## PURA2.syn  1.533612 -0.064650863  6.116079 1.021127e-09 9.977682e-07 11.72535
## 1433G.syn  1.531810 -0.030298143  6.111187 1.052725e-09 9.977682e-07 11.69664
## 1433Z.syn  1.530272 -0.037679811  6.103304 1.105657e-09 9.977682e-07 11.65043
## CSN5.syn   1.516221 -0.012447993  6.045812 1.578450e-09 1.353205e-06 11.31517
## FAF1.syn   1.513511 -0.072829802  6.037095 1.665530e-09 1.359865e-06 11.26461
## GSLG1.syn -1.510924  0.026154310 -6.027699 1.764641e-09 1.375297e-06 11.21019
## PPME1.syn  1.505743 -0.045685001  6.006162 2.013989e-09 1.420040e-06 11.08578
## HNRPK.syn  1.504202  0.005252015  6.003692 2.044685e-09 1.420040e-06 11.07154
## pP55327.1 -1.505663  0.099833704 -6.001642 2.070513e-09 1.420040e-06 11.05973
## HGS.syn    1.502475 -0.026823754  5.992349 2.191644e-09 1.445305e-06 11.00622
## GUAA.syn   1.500551 -0.051628875  5.985309 2.287980e-09 1.452952e-06 10.96574
## SNX6.syn   1.498201 -0.033460567  5.975578 2.427940e-09 1.486766e-06 10.90986
## DP13A.syn  1.495068 -0.045357310  5.965390 2.583427e-09 1.527429e-06 10.85145
## DCTN4.syn  1.490574 -0.030042992  5.944619 2.931035e-09 1.647096e-06 10.73268
resGN.predictS = runGnetMulti(t(data), y=pheno$Group=="Sz", nfolds=3, bootstrap = T, alpha = 0.9)
## family is set to binomial
# hist(resGN.predictS$vals, main = "resGN.predictS") + title(sub = "alpha=0.9")
# hist(resGN.predictS$features[resGN.predictS$features > 0], main = "resGN_features") + title(sub="alpha=0.9")

choose featrues to label

# choose features to label
toLabel.predictS = rownames(data)[which(resGN.predictS$features>75 | abs(limres.predictS$t)>6)]
length(toLabel.predictS) #do not plot more than 30 labels, they will be subsampled
## [1] 33

scatter plot for feature selection

myggscatter(resGN.predictS$features, limres.predictS$t, line = F, label = rownames(data),
            xlab="Number of models chosen in 100 elastic net regressions",
            ylab="t-statistic value",
            label.select = toLabel.predictS,
            title="Focal variable: diagnosis (Sz-Control)",
            subtitle = "models > 60 or abs(t) > 6")
## Warning in myggscatter(resGN.predictS$features, limres.predictS$t, line = F, :
## More than 30 labels were subsampled to 30
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

3. focal variables: Avg.spineS

limma

spine.dens = as.matrix(spine.dens)
modSpine = model.matrix(~1+spine.dens[7,])
colnames(modSpine)[2] = "spineD"
limresSpine = fitLimma(protein, modSpine , "spineD - 0")
topTable(limresSpine, n=30)
##                 logFC      AveExpr         t      P.Value   adj.P.Val        B
## pQ8IVF5.4  -0.8934745 -0.076267606 -5.353717 8.618617e-08 0.001477489 7.431116
## DPYL1.syn  -0.8570338 -0.053946060 -5.135363 2.816563e-07 0.002414217 6.361533
## pQ92688     0.8425958 -0.038933652  5.048851 4.445602e-07 0.002540365 5.950076
## pQ9H706     0.8184724 -0.110583926  4.904303 9.377557e-07 0.004018987 5.278201
## OGA.syn    -0.8040669 -0.029200734 -4.817985 1.450385e-06 0.004972789 4.886289
## PURA2.syn  -0.7841306 -0.064650863 -4.698526 2.620829e-06 0.007488145 4.355392
## pQ9UQB3.1  -0.7648836 -0.049361315 -4.583198 4.579786e-06 0.008358399 3.855492
## SODM.syn    0.7638344 -0.028518932  4.576911 4.719532e-06 0.008358399 3.828599
## TCPA.syn   -0.7627312 -0.062298015 -4.570300 4.870884e-06 0.008358399 3.800360
## VP26C.syn  -0.7626967 -0.038831394 -4.570094 4.875692e-06 0.008358399 3.799478
## 1433Z.syn  -0.7530230 -0.037679811 -4.512129 6.418801e-06 0.008679003 3.553627
## 1433S.syn  -0.7514992 -0.083566757 -4.502998 6.700955e-06 0.008679003 3.515186
## CNDP2.syn  -0.7510871 -0.069975733 -4.500529 6.779295e-06 0.008679003 3.504801
## TCPE.syn   -0.7485420 -0.052082128 -4.485278 7.282757e-06 0.008679003 3.440806
## pP21359     0.7427877  0.009170296  4.450799 8.556154e-06 0.008679003 3.296917
## pP23528.3   0.7415122  0.040656564  4.443156 8.865912e-06 0.008679003 3.265173
## pQ96PR1.1   0.7406776 -0.049388739  4.438155 9.074386e-06 0.008679003 3.244430
## DCTN4.syn  -0.7389771 -0.030042992 -4.427965 9.513712e-06 0.008679003 3.202241
## AGRB1.syn   0.7368582  0.008209034  4.415269 1.008960e-05 0.008679003 3.149807
## pQ6IQ19     0.7367035 -0.008968839  4.414342 1.013291e-05 0.008679003 3.145986
## 1433E.syn  -0.7317216 -0.046139704 -4.384490 1.162705e-05 0.008679003 3.023323
## TCPQ.syn   -0.7306619 -0.026529317 -4.378141 1.197091e-05 0.008679003 2.997341
## pP51608.5   0.7302414  0.023459948  4.375621 1.211006e-05 0.008679003 2.987038
## NCOA7.syn  -0.7266821 -0.050324376 -4.354294 1.335109e-05 0.008679003 2.900091
## PTPA.syn   -0.7255760 -0.049253269 -4.347666 1.376088e-05 0.008679003 2.873156
## TCPD.syn   -0.7234032 -0.016869695 -4.334646 1.460101e-05 0.008679003 2.820367
## pO14525.1   0.7230085  0.012618937  4.332281 1.475878e-05 0.008679003 2.810794
## LRP1.syn    0.7229549 -0.023836803  4.331960 1.478031e-05 0.008679003 2.809496
## GSLG1.syn   0.7228363  0.026154310  4.331250 1.482810e-05 0.008679003 2.806621
## pP46821.87  0.7215738  0.054415971  4.323685 1.534607e-05 0.008679003 2.776044

run multi-elastic net

resGN.spine = runGnetMulti(t(protein), y=spine.dens[7,], nfolds=3, bootstrap = T, alpha = 0.9)
## family is set to gaussian

selection criteria

# classification AUCs
# hist(as.numeric(resGN.spine$vals), main = "resGN.spine")+title(sub="alpha=0.9")
# number of times features are used
# hist(resGN.spine$features[resGN.spine$features > 0], main = "resGN.spine_features") + title(sub="alpha=0.9")

choose features to label

toLabel.Spine = rownames(protein)[which(resGN.spine$features>60 | abs(limresSpine$t)>4.4)]
length(toLabel.Spine) #do not plot more than 30 labels, they will be subsampled
## [1] 31

scatter plot

myggscatter(resGN.spine$features, limresSpine$t, line = F, label = rownames(protein),
            xlab="Number of models chosen in 100 elastic net regressions",
            ylab="t-statistic value",
            label.select = toLabel.Spine,
            title="Focal variable: Avr.SpineS",
            subtitle = "models > 60 or abs(t) > 4.4")
## Warning in myggscatter(resGN.spine$features, limresSpine$t, line = F, label =
## rownames(protein), : More than 30 labels were subsampled to 30
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

4. marginal assocaition for avg.spineS in Sz adn Ctrls

SzP = rownames(pheno[pheno$Group=='Sz',])
Ctrl = rownames(pheno[pheno$Group=='C',])
# marginal association for Avr.spineS in Sz and Controls
limresSz = fitLimma(protein[,SzP], modSpine[which(pheno$Group=="Sz"),], "spineD - 0")
limresCtrl = fitLimma(protein[,Ctrl], modSpine[which(pheno$Group=="C"),], "spineD - 0")
topTable(limresSz)
##               logFC     AveExpr        t      P.Value adj.P.Val         B
## pP17096.4 1.0756418  0.28145853 4.268223 2.321618e-05 0.3979950 -3.265292
## pQ92688   1.0077131 -0.44406577 4.012478 6.843059e-05 0.5865528 -3.425685
## pQ6IQ19   0.9676038 -0.37448605 3.803591 1.586395e-04 0.6436434 -3.550098
## pQ9H706   0.9581040 -0.41943871 3.789843 1.674420e-04 0.6436434 -3.558076
## pQ969T4   0.9530234 -0.33394129 3.734906 2.074311e-04 0.6436434 -3.589693
## RL37A.syn 0.9376995 -0.18918192 3.713557 2.252733e-04 0.6436434 -3.601867
## pQ15057   0.8826156 -0.03376813 3.481205 5.389212e-04 0.9094734 -3.730232
## pQ6H8Q1.6 0.8825834 -0.15424995 3.480261 5.407822e-04 0.9094734 -3.730738
## pP23468.1 0.8644651  0.03465700 3.432091 6.441229e-04 0.9094734 -3.756390
## pO15027.2 0.8484686 -0.18462014 3.335929 9.076863e-04 0.9094734 -3.806612
topTable(limresCtrl)
##                 logFC     AveExpr         t      P.Value adj.P.Val         B
## PPM1E.syn  -0.9458450 -0.34740985 -3.473051 0.0006771436 0.9999921 -3.961855
## pQ96IZ7     0.8914463 -0.07009488  3.386813 0.0009085221 0.9999921 -3.993812
## pQ15036.2   0.8970421  0.13330417  3.341981 0.0010562638 0.9999921 -4.010195
## pO75190.1  -0.8579486 -0.36272190 -3.267242 0.0013534502 0.9999921 -4.037149
## pQ9UQ35.87  0.8792725 -0.46504635  3.245957 0.0014513746 0.9999921 -4.044743
## HNRPC.syn  -0.8435109  0.05558890 -3.217678 0.0015916982 0.9999921 -4.054774
## pQ5T200.4   0.8420023 -0.07732842  3.160574 0.0019143067 0.9999921 -4.074829
## pP19634.4   0.8300463  0.48413324  3.153196 0.0019601525 0.9999921 -4.077400
## pQ09470.2   0.8230990  0.34378664  3.129157 0.0021166955 0.9999921 -4.085746
## pO60241.1  -0.8342739 -0.30105360 -3.094255 0.0023646789 0.9999921 -4.097777
resGN.spineSz = runGnetMulti(t(protein[,SzP]), y=spine.dens[7,SzP], nfolds=3, bootstrap = T, alpha = 0.9)
## family is set to gaussian
resGN.spineCtrl = runGnetMulti(t(protein[,Ctrl]), y=spine.dens[7,Ctrl], nfolds=3, bootstrap = T, alpha = 0.9)
## family is set to gaussian
# hist(resGN.spineSz$features[resGN.spineSz$features > 0], main = "resGN.spineSz_features") + title(sub="alpha=0.9")
# hist(resGN.spineCtrl$features[resGN.spineCtrl$features > 0], main = "resGN.spineCtrl_features") + title(sub="alpha=0.9")
toLabel.SpineSz = rownames(protein)[which(resGN.spineSz$features>5 | abs(limresSz$t)>3.5)]
length(toLabel.SpineSz) #do not plot more than 30 labels, they will be subsampled
## [1] 11
toLabel.SpineCtrl = rownames(protein)[which(resGN.spineCtrl$features>5 | abs(limresCtrl$t)>3)]
length(toLabel.SpineCtrl) #do not plot more than 30 labels, they will be subsampled
## [1] 20
myggscatter(resGN.spineSz$features, limresSz$t, line = F, label = rownames(protein),
            xlab="Number of models chosen in 100 elastic net regressions",
            ylab="t-statistic value",
            label.select = toLabel.SpineSz,
            title="Focal variable: Avr.SpineS, Sz group", subtitle = "models > 5 or abs(t) > 3.5")

myggscatter(resGN.spineCtrl$features, limresCtrl$t, line = F, label = rownames(protein),
            xlab="Number of models chosen in 100 elastic net regressions",
            ylab="t-statistic value",
            label.select = toLabel.SpineCtrl,
            title="Focal variable: Avr.SpineS, Ctrl group", subtitle = "models > 5 or abs(t) > 3")
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

toLabel.SpineMarginal = unique(c(toLabel.SpineSz, toLabel.SpineCtrl))

5. Spine vs diagnosis Interaction

# limma: nothing with FDR < 0.05
modSpineInt=model.matrix(~ as.numeric(pheno$Group=='Sz') + spine.dens[1,] + as.numeric(pheno$Group=='Sz'):spine.dens[1,])
colnames(modSpineInt)[4] = "SzSpine"
limresProtSpineInteraction=fitLimma(protein, modSpineInt , "SzSpine - 0")
hist(limresProtSpineInteraction$p.value)

topTable(limresProtSpineInteraction)
##                logFC     AveExpr         t      P.Value adj.P.Val         B
## pO14976    1.0941124 -0.11475539  3.533019 0.0004202093  0.999942 -4.058568
## pQ9C0D0.1 -1.0546524  0.08819297 -3.418221 0.0006425060  0.999942 -4.095722
## pQ9H307.2 -0.9811641 -0.02608153 -3.160182 0.0016004114  0.999942 -4.175194
## RAE2       0.9708722 -0.03278399  3.133518 0.0017524109  0.999942 -4.183047
## pQ9UKM9.2  0.9545776  0.04792047  3.080798 0.0020927219  0.999942 -4.198381
## pQ5T200.4 -0.9244881  0.09490526 -2.989077 0.0028324904  0.999942 -4.224451
## pQ96AG3.1 -0.9133800  0.14983950 -2.943205 0.0032859246  0.999942 -4.237198
## VAMP3.syn -0.9053292  0.02395728 -2.928646 0.0034424237  0.999942 -4.241177
## pQ15036.2 -0.8944883  0.09819100 -2.890277 0.0038907264  0.999942 -4.251665
## MADD.syn   0.8904018 -0.01244468  2.885051 0.0039539454  0.999942 -4.253026

causal graph construction

1. Sampling function

Returns a list of sampling results

  • default n = 20
  • no tears default lambda1 = 0.1
graphSampling <- function(protein, LV, spine, prot.use, pheno = NULL, spinePredict = NULL, n = 20, lambda1 = 0.1) {
    graph_list = list()
    for (i in 1:n) {
        prot.sub = unique(sample(prot.use, replace=T))
        nt.data = cbind(t(protein[prot.sub,]), t(LV), "Spine" = spine)
        if(!is.null(pheno)) {
            nt.data = cbind(nt.data, "Sz" = pheno)
        }
        if (!is.null(spinePredict)) {
            nt.data = cbind(nt.data, t(spinePredict))
        }
        ntres = notearsInterceptMultiLoss(nt.data, lambda1 = lambda1)
        graph = ntres$graph
        graph_list[[i]]=graph 
    }
    graph_list
}

2. graph construction function

Input graph list, out put all graph list

  • default edge threshold = 0.2 (top30 in each sampled subgrapgh)
  • default n = 20
  • default n_cutoff = 10
graphConstruction <- function (graph_list, name, thres = 0.2, n = 20, n_cutoff = 10) {
    l = length(name)
    newgraph = matrix(data=c(0.0), nrow = l, ncol = l)
    rownames(newgraph) = name
    colnames(newgraph) = rownames(newgraph)
    
    for (i in 1:l) {
        for (j in 1:l) {
            cnt = 0;
            sum = 0.0;
            row_name=name[i]
            col_name=name[j]
            for (k in 1:n) {
                namek = rownames(graph_list[[k]])
                if (row_name %in% namek & col_name %in% namek) {
                    if(abs(graph_list[[k]][row_name, col_name]) >= thres) {
                        cnt = cnt+1;
                        sum = sum + graph_list[[k]][row_name, col_name]
                    }
                }
            }
            if (cnt >= n_cutoff) {
                newgraph[i,j] = 1.0*sum/cnt
            }
        }
    }
    newgraph
}

3. Feature selection Focal variable: Diagnosis

Nodes

prot.use = toLabel
lv.use = colnames(res$Z)[which(colSums(res$Z[prot.use,])>0)]

Sample defalut n=20

SampleDx_Predict = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                                 pheno = pheno$Group=="Sz", spinePredict = tscale(spinePredict))
SampleDx_NoSz_Predict = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                                      spinePredict = tscale(spinePredict))
SampleDx = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                         pheno = pheno$Group=="Sz")
SampleDx_NoSz = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,)

look at top 20/30/50 edge cutoff

for (i in 1:20) {
    sorted = sort(abs(SampleDx_NoSz[[i]]), T)
    print(c(sorted[21], sorted[31], sorted[51]))
}
## [1] 0.3032011 0.2268166 0.1232914
## [1] 0.3266034 0.2183912 0.1374592
## [1] 0.2825062 0.2303501 0.1264078
## [1] 0.3343950 0.2158181 0.1257517
## [1] 0.3186735 0.1788678 0.1077261
## [1] 0.3351227 0.2354051 0.1430549
## [1] 0.2441629 0.1886909 0.1027224
## [1] 0.3399627 0.2510639 0.1311407
## [1] 0.2935994 0.2126151 0.1065618
## [1] 0.3243118 0.2506505 0.1532260
## [1] 0.3172415 0.2028005 0.1156485
## [1] 0.27082708 0.21342237 0.09029881
## [1] 0.3327649 0.2464871 0.1104643
## [1] 0.34912482 0.19763875 0.07772649
## [1] 0.3015028 0.2631653 0.1516571
## [1] 0.35446586 0.22910402 0.09652205
## [1] 0.2565801 0.2006097 0.1055853
## [1] 0.3330527 0.2187216 0.1304274
## [1] 0.29297826 0.19356645 0.08885504
## [1] 0.3509489 0.2311544 0.1365131

Construct new graph with default threshold=0.2

name = c(prot.use, lv.use, "Spine", "Sz", rownames(spinePredict))
GraphDx_Predict = graphConstruction(graph_list = SampleDx_Predict, name = name)

Szgroup = length
name = c(prot.use, lv.use, "Spine", rownames(spinePredict))
GraphDx_NoSz_Predict = graphConstruction(graph_list = SampleDx_NoSz_Predict, name = name)

name = c(prot.use, lv.use, "Spine", "Sz")
GraphDx = graphConstruction(graph_list = SampleDx, name = name)

name = c(prot.use, lv.use, "Spine")
GraphDx_NoSz = graphConstruction(graph_list = SampleDx_NoSz, name = name)

Visualize

# graph varType
SYN=length(grep(".syn", prot.use))
Phos=length(grep("p", prot.use))
HOM=length(prot.use)-SYN-Phos
Prot = c(rep("Phos", Phos), rep("Hom", HOM), rep("Syn", SYN))
lenLV = rep("LV",length(lv.use))

Focal variable: diagnosis

  • protein, LVs, and avg.spineS + Sz and predicted spines
gvarType = c(Prot, lenLV, "Spine", "Sz", rep("PredictSpine",3))
network_visualize(GraphDx_Predict, gvarType)

  • protein, LVs, and avg.spineS + predicted spines
gvarType = c(Prot, lenLV, "Spine", rep("PredictSpine",3))
network_visualize(GraphDx_NoSz_Predict, gvarType)

  • protein, LVs, and avg.spineS + Sz
gvarType = c(Prot, lenLV, "Spine", "Sz")
network_visualize(GraphDx, gvarType)

  • protein, LVs, and avg.spineS
gvarType = c(Prot, lenLV, "Spine")
network_visualize(GraphDx_NoSz, gvarType)

4. Feature selection Focal variable: Average spine density S

Nodes

prot.use = toLabel.Spine
lv.use = colnames(res$Z)[which(colSums(res$Z[prot.use,])>0)]

Sample defalut n=20

SampleSpine_Predict = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                                 pheno = pheno$Group=="Sz", spinePredict = tscale(spinePredict))
SampleSpine_NoSz_Predict = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                                      spinePredict = tscale(spinePredict))
SampleSpine = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                         pheno = pheno$Group=="Sz")
SampleSpine_NoSz = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use)

look at top 20/30/50 edge cutoff

for (i in 1:20) {
    sorted = sort(abs(SampleSpine_NoSz[[i]]), T)
    print(c(sorted[21], sorted[31], sorted[51]))
}
## [1] 0.2530889 0.1907942 0.1145262
## [1] 0.2794521 0.2120853 0.1472086
## [1] 0.2312279 0.1662027 0.1150866
## [1] 0.2405788 0.1947410 0.1094901
## [1] 0.2825583 0.1963016 0.1136881
## [1] 0.26486286 0.17420982 0.09494001
## [1] 0.2472721 0.1916586 0.1171055
## [1] 0.2743022 0.2069258 0.1310184
## [1] 0.24072757 0.19809292 0.09993045
## [1] 0.23020913 0.17054136 0.09590431
## [1] 0.2698287 0.2279909 0.1359897
## [1] 0.2604606 0.1947551 0.1075155
## [1] 0.2711087 0.2143835 0.1029340
## [1] 0.2666014 0.2093371 0.1431590
## [1] 0.2390029 0.1861309 0.1050572
## [1] 0.2735049 0.2029109 0.1201743
## [1] 0.23541449 0.16127586 0.09253285
## [1] 0.2570886 0.1983001 0.1230010
## [1] 0.2577424 0.2255661 0.1581288
## [1] 0.2575156 0.1980991 0.1197271

Construct new graph with default threshold=0.2

name = c(prot.use, lv.use, "Spine", "Sz", rownames(spinePredict))
GraphSpine_Predict = graphConstruction(graph_list = SampleSpine_Predict, name = name)

name = c(prot.use, lv.use, "Spine", rownames(spinePredict))
GraphSpine_NoSz_Predict = graphConstruction(graph_list = SampleSpine_NoSz_Predict, name = name)

name = c(prot.use, lv.use, "Spine", "Sz")
GraphSpine = graphConstruction(graph_list = SampleSpine, name = name)

name = c(prot.use, lv.use, "Spine")
GraphSpine_NoSz = graphConstruction(graph_list = SampleSpine_NoSz, name = name)

Visualize

# graph varType
SYN=length(grep(".syn", prot.use))
Phos=length(grep("p", prot.use))
HOM=length(prot.use)-SYN-Phos
Prot = c(rep("Phos", Phos), rep("Hom", HOM), rep("Syn", SYN))
lenLV = rep("LV",length(lv.use))

focal variable: avg.spineS

  • protein, LVs, and avg.spineS + Sz and predicted spines
gvarType = c(Prot, lenLV, "Spine", "Sz", rep("PredictSpine",3))
network_visualize(GraphSpine_Predict, gvarType)

  • protein, LVs, and avg.spineS + predicted spines
gvarType = c(Prot, lenLV, "Spine", rep("PredictSpine",3))
network_visualize(GraphSpine_NoSz_Predict, gvarType)

  • protein, LVs, and avg.spineS + Sz
gvarType = c(Prot, lenLV, "Spine", "Sz")
network_visualize(GraphSpine, gvarType)

  • protein, LVs, and avg.spineS
gvarType = c(Prot, lenLV, "Spine")
network_visualize(GraphSpine_NoSz, gvarType)

5. Focal variable: Avg.spineS, group = Sz or group = C

Nodes

prot.use = toLabel.SpineMarginal
lv.use = colnames(res$Z)[which(colSums(res$Z[prot.use,])>0)]

Sample defalut n=20

SampleSpineM_Predict = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                                 pheno = pheno$Group=="Sz", spinePredict = tscale(spinePredict))
SampleSpineM_NoSz_Predict = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                                      spinePredict = tscale(spinePredict))
SampleSpineM = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use,
                         pheno = pheno$Group=="Sz")
SampleSpineM_NoSz = graphSampling(protein = tscale(protein), LV = tscale(res$B[lv.use,]), spine = tscale(spine.dens)[7,], prot.use = prot.use)

look at top 20/30/50 edge cutoff

for (i in 1:20) {
    sorted = sort(abs(SampleSpineM_Predict[[i]]), T)
    print(c(sorted[21], sorted[31], sorted[51]))
}
## [1] 0.2710647 0.2166053 0.1507901
## [1] 0.2671103 0.2329998 0.1625837
## [1] 0.2907356 0.2266787 0.1432652
## [1] 0.2801489 0.2179566 0.1444579
## [1] 0.2937074 0.2256596 0.1734228
## [1] 0.2685831 0.2062389 0.1389371
## [1] 0.2595864 0.2258823 0.1768834
## [1] 0.2650412 0.2156483 0.1381464
## [1] 0.2652130 0.2031971 0.1466063
## [1] 0.2656629 0.2225641 0.1589334
## [1] 0.2925721 0.2398683 0.1612636
## [1] 0.2606099 0.2054587 0.1529708
## [1] 0.2883908 0.2121162 0.1383166
## [1] 0.2740144 0.2300112 0.1457400
## [1] 0.3019276 0.2534426 0.1795049
## [1] 0.2928640 0.2381717 0.1733600
## [1] 0.2584094 0.2106583 0.1463333
## [1] 0.2575325 0.2315021 0.1478743
## [1] 0.2556917 0.2059857 0.1361789
## [1] 0.2720559 0.2399823 0.1721586

Construct new graph with default threshold=0.2

name = c(prot.use, lv.use, "Spine", "Sz", rownames(spinePredict))
GraphSpineM_Predict = graphConstruction(graph_list = SampleSpineM_Predict, name = name)

name = c(prot.use, lv.use, "Spine", rownames(spinePredict))
GraphSpineM_NoSz_Predict = graphConstruction(graph_list = SampleSpineM_NoSz_Predict, name = name)

name = c(prot.use, lv.use, "Spine", "Sz")
GraphSpineM = graphConstruction(graph_list = SampleSpineM, name = name)

name = c(prot.use, lv.use, "Spine")
GraphSpineM_NoSz = graphConstruction(graph_list = SampleSpineM_NoSz, name = name)

Visualize

# graph varType
Prot = c(rep("spineSz", length(toLabel.SpineSz)), rep("SpineCtrl", length(toLabel.SpineCtrl)))
lenLV = rep("LV",length(lv.use))

focal variable: avg.spineS, in Sz or Ctrl group

  • protein, LVs, and avg.spineS + Sz and predicted spines
gvarType = c(Prot, lenLV, "Spine", "Sz", rep("PredictSpine",3))
network_visualize(GraphSpineM_Predict, gvarType)

  • protein, LVs, and avg.spineS + predicted spines
gvarType = c(Prot, lenLV, "Spine", rep("PredictSpine",3))
network_visualize(GraphSpineM_NoSz_Predict, gvarType)

  • protein, LVs, and avg.spineS + Sz
gvarType = c(Prot, lenLV, "Spine", "Sz")
network_visualize(GraphSpineM, gvarType)

  • protein, LVs, and avg.spineS
gvarType = c(Prot, lenLV, "Spine")
network_visualize(GraphSpineM_NoSz, gvarType)

relationship between LV and pH

t_res=tscale(res$B)
LV_pH = data.frame()
for (i in 1:nrow(t_res)) {
    cor=cor.test(t_res[i,], pheno$pH)
    a=t(melt(unlist(cor)))
    LV_pH = rbind(LV_pH, a)
}
rownames(LV_pH)=rownames(t_res)
LV_pH=as.data.frame(LV_pH)
LV_pH$p.value=as.numeric(LV_pH$p.value)
LV_pH$estimate.cor=as.numeric(LV_pH$estimate.cor)
ggscatter(LV_pH, "estimate.cor", "p.value", label = rownames(LV_pH), repel = T,
          xlim = c(-0.5, 0.5),
          ylab = "log10 scaled p-val",
          xlab = "Pearson correlation") +
    scale_y_log10(breaks = c(1e-4, 1e-3, 1e-2, 0.05, 0.1, 0.25, 0.5)) +
    labs(title = "Correlation between LVs and pH")